library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gapminder)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

Question 1

# Read the restaurant inspection CSV into a base data frame, then print a
# tibble view of it. The tibble is only printed, not stored, so `mydata`
# itself stays a data.frame.
mydata <- read.csv(file = "/Users/rohin/Desktop/NYRestaurantInspection2022.csv")
mydata %>% as_tibble()
## # A tibble: 240,610 × 27
##       CAMIS DBA          BORO  BUILDING STREET ZIPCODE PHONE CUISINE.DESCRIPTION
##       <int> <chr>        <chr> <chr>    <chr>  <chr>   <chr> <chr>              
##  1 50117016 "NO CHEWING… Manh… 1802     65TH … ""      9178… ""                 
##  2 50116677 "MAMAN"      Broo… 154      COURT… "11201" 7184… ""                 
##  3 50126777 "TANG MUSIC… Quee… 5530     58TH … "11378" 3478… ""                 
##  4 50111558 "LEGENDS HO… Manh… 1        INTRE… ""      9172… ""                 
##  5 50116856 "LA PECORA … Manh… 817      WASHI… "10014" 2128… ""                 
##  6 50108825 "AMPLE HILL… Broo… 1        WATER… "11201" 3478… ""                 
##  7 50075103 "PQR"        Manh… 1631     2 AVE… "10028" 9174… "Pizza"            
##  8 50127765 ""           Manh… 695      PARK … "10065" 3478… ""                 
##  9 50124776 ""           Manh… 12       PARK … "10016" 9292… ""                 
## 10 50110734 "Chelsea Ta… Manh… 152      WEST … "10001" 9175… ""                 
## # ℹ 240,600 more rows
## # ℹ 19 more variables: INSPECTION.DATE <chr>, ACTION <chr>,
## #   VIOLATION.CODE <chr>, VIOLATION.DESCRIPTION <chr>, CRITICAL.FLAG <chr>,
## #   SCORE <int>, GRADE <chr>, GRADE.DATE <chr>, RECORD.DATE <chr>,
## #   INSPECTION.TYPE <chr>, Latitude <dbl>, Longitude <dbl>,
## #   Community.Board <int>, Council.District <int>, Census.Tract <int>,
## #   BIN <int>, BBL <dbl>, NTA <chr>, Location.Point <lgl>
# Keep only the Queens rows whose cuisine is recorded as "Pizza".
mydata1 <- filter(mydata, BORO == "Queens", CUISINE.DESCRIPTION == "Pizza")

# Count rows per business name (DBA) and show the five largest counts.
# Names are compared verbatim, so spelling variants are counted separately.
mydata1 %>%
  group_by(DBA) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  slice_head(n = 5)
## # A tibble: 5 × 2
##   DBA               count
##   <chr>             <int>
## 1 DOMINO'S            130
## 2 PAPA JOHN'S PIZZA    69
## 3 PAPA JOHN'S          68
## 4 DOMINOS              48
## 5 ROSA'S PIZZA         48
# Print the inspection date of every Queens pizza row for this one restaurant.
# group_by() turns the result into a grouped tibble, so the printout also
# reports how many distinct dates occur; n = Inf prints all rows.
mydata1 %>%
  filter(DBA == "SUSANO'S PIZZERIA & RESTAURANT") %>%
  select(INSPECTION.DATE) %>%
  group_by(INSPECTION.DATE) %>%
  print(n = Inf)
## # A tibble: 17 × 1
## # Groups:   INSPECTION.DATE [5]
##    INSPECTION.DATE
##    <chr>          
##  1 07/31/2019     
##  2 07/31/2019     
##  3 07/31/2019     
##  4 05/05/2022     
##  5 12/09/2019     
##  6 07/31/2019     
##  7 08/14/2019     
##  8 07/31/2019     
##  9 07/31/2019     
## 10 01/08/2020     
## 11 05/05/2022     
## 12 05/05/2022     
## 13 12/09/2019     
## 14 12/09/2019     
## 15 12/09/2019     
## 16 01/08/2020     
## 17 05/05/2022

Question 2

# Load the tab-separated Gini data set (gapminder 2007, judging by the file
# name) into a data frame.
mydata2 <- read.delim(file = "/Users/rohin/Desktop/gapminder_2007_gini.tsv")

# Box plot of the Gini index per continent, coloured by continent, with
# outliers drawn as size-3 red points (shape 16).
ggplot(mydata2) +
  geom_boxplot(
    aes(x = continent, y = gini, color = continent),
    outlier.colour = "red",
    outlier.shape = 16,
    outlier.size = 3,
    notch = FALSE
  ) +
  ggtitle("Gini Index in all continents")

# Called without arguments, so it converts the most recent ggplot.
ggplotly()
# Scatter plot of life expectancy against the Gini index, one panel per
# continent; point size follows population. `label = country` is mapped as an
# aesthetic as well -- presumably so ggplotly() can show it in the tooltip.
ggplot(
  mydata2,
  aes(x = gini, y = lifeExp, color = continent, size = pop, label = country)
) +
  geom_point() +
  ggtitle("life expectancy V gini index") +
  facet_wrap(~continent)

# Called without arguments, so it converts the most recent ggplot.
ggplotly()
# Per-continent minimum, maximum and mean of the Gini index, ignoring NAs.
mydata2 %>%
  group_by(continent) %>%
  summarise(
    minimum = min(gini, na.rm = TRUE),
    maximum = max(gini, na.rm = TRUE),
    mean = mean(gini, na.rm = TRUE)
  )
## # A tibble: 5 × 4
##   continent minimum maximum  mean
##   <chr>       <dbl>   <dbl> <dbl>
## 1 Africa       30.8    63.2  43.9
## 2 Americas     32.1    60.8  48.2
## 3 Asia         29.6    49    40.2
## 4 Europe       23.7    40.2  30.5
## 5 Oceania      30.3    36.2  33.2

#(2b) There is a clear negative correlation between life expectancy and the Gini index. From the plot we can see that countries with a lower Gini index tend to have a higher life expectancy, and countries with a higher Gini index tend to have a lower life expectancy.

Question 3

# Add a total-GDP column to gapminder: population times GDP per capita.
gdp1 <- gapminder %>%
  mutate(gdp = pop * gdpPercap)
head(gdp1)
## # A tibble: 6 × 7
##   country     continent  year lifeExp      pop gdpPercap          gdp
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>        <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.  6567086330.
## 2 Afghanistan Asia       1957    30.3  9240934      821.  7585448670.
## 3 Afghanistan Asia       1962    32.0 10267083      853.  8758855797.
## 4 Afghanistan Asia       1967    34.0 11537966      836.  9648014150.
## 5 Afghanistan Asia       1972    36.1 13079460      740.  9678553274.
## 6 Afghanistan Asia       1977    38.4 14880372      786. 11697659231.
# Reference value: the United States row(s) for 2007 (expected to be one row).
usa_gdp <- filter(gdp1, country == "United States", year == 2007)

# Express every row's GDP as a fraction of that 2007 US GDP.
gdp2 <- gdp1 %>%
  mutate(gdp_ratio = gdp / usa_gdp$gdp)
head(gdp2)
## # A tibble: 6 × 8
##   country     continent  year lifeExp      pop gdpPercap          gdp gdp_ratio
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>        <dbl>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.  6567086330.  0.000508
## 2 Afghanistan Asia       1957    30.3  9240934      821.  7585448670.  0.000586
## 3 Afghanistan Asia       1962    32.0 10267083      853.  8758855797.  0.000677
## 4 Afghanistan Asia       1967    34.0 11537966      836.  9648014150.  0.000746
## 5 Afghanistan Asia       1972    36.1 13079460      740.  9678553274.  0.000748
## 6 Afghanistan Asia       1977    38.4 14880372      786. 11697659231.  0.000904
# Median GDP ratio for each continent in each year.
# `.groups = "drop"` makes the result an ungrouped tibble and silences the
# "summarise() has grouped output by 'continent'" message, so later steps do
# not silently inherit the leftover continent grouping.
gdp3 <- gdp2 %>%
  group_by(continent, year) %>%
  summarize(median = median(gdp_ratio), .groups = "drop")
# Point-and-line chart of the per-continent median GDP ratio by year.
ggplot(gdp3, aes(x = year, y = median, color = continent)) +
  geom_point() +
  geom_line() +
  ggtitle("Median V GDP Ratio")

# Called without arguments, so it converts the most recent ggplot.
ggplotly()